Basic R

Cast dates

# Benchmark inputs: ISO-formatted date strings. Kept as character so the
# "Nocast" variant below exercises difftime()'s implicit coercion.
start_date <- "2017-01-01"
end_date <- "2019-12-31"
# Whole weeks between d2 and d1, letting difftime() coerce the
# character inputs to date-times itself (no explicit cast).
f1 <- function(d2, d1) {
  weeks_elapsed <- as.numeric(difftime(d2, d1, units = "weeks"))
  n_weeks <- floor(weeks_elapsed)
}

# Whole weeks between d2 and d1, casting both inputs to Date first.
f2 <- function(d2, d1) {
  delta <- difftime(as.Date(d2), as.Date(d1), units = "weeks")
  n_weeks <- floor(as.numeric(delta))
}
# Time both variants over 1000 runs: casting to Date up front beats the
# coercion difftime() performs internally on character input.
m1<-microbenchmark(
  Nocast = f1(end_date, start_date),
  Cast = f2(end_date, start_date),
  times = 1000
)
print(m1)
## Unit: microseconds
##    expr     min       lq     mean   median      uq      max neval
##  Nocast 368.988 376.9135 401.1525 380.7910 392.202 4086.296  1000
##    Cast 125.033 129.4370 137.7458 130.9895 134.892 2715.518  1000
fbox_plot(m1, "microseconds")

Preallocating a vector with an explicit length via vector("type", length) is faster than growing an empty vector created with c()

# Build a vector by appending to a zero-length vector: every iteration
# reallocates and copies, so the total work is quadratic in n.
no_size <- function(n) {
  grown <- c()
  for (k in seq(n)) {
    grown <- c(grown, k)
  }
}

# Build the same vector after preallocating its full length, so each
# iteration is a cheap in-place element write.
explicit_size <- function(n) {
  prealloc <- vector("integer", n)
  for (k in seq(n)) {
    prealloc[k] <- k
  }
}
# Time both builders for n = 1e4: growing with c() is quadratic, the
# preallocated version is linear.
m3 <- microbenchmark(
  no_size = no_size(1e4),
  explicit_size = explicit_size(1e4),
  times = 10
)
print(m3)
## Unit: microseconds
##           expr       min       lq       mean    median        uq        max
##        no_size 68810.468 69096.43 74809.0331 70898.133 74436.928 101232.712
##  explicit_size   327.411   331.95   659.0558   366.073   371.934   3217.043
##  neval
##     10
##     10
fbox_plot(m3, "microseconds")

The which() function is slow in some simple situations; summing a logical vector directly is faster

# 1e8 uniform draws used as the benchmark input below.
# NOTE(review): the name `vector` shadows base::vector(); renaming it
# (e.g. x_vals) here and in the m4 calls would be safer.
vector <- runif(1e8)
# Count values above 0.5 via which() + length(): materialises the whole
# index vector before counting it.
w1 <- function(x) {
  above <- which(x > .5)
  d <- length(above)
}

# Count values above 0.5 by summing the logical mask directly.
w2 <- function(x) {
  mask <- x > .5
  d <- sum(mask)
}

# Time which()-based counting against a direct logical sum (10 runs).
m4 <- microbenchmark(
  which = w1(vector),
  nowhich = w2(vector),
  times = 10
)
print(m4)
## Unit: milliseconds
##     expr      min       lq     mean   median       uq      max neval
##    which 628.4986 630.2693 662.1551 631.4521 633.1820 850.1048    10
##  nowhich 218.0632 220.0816 222.1767 223.0102 223.3368 225.1237    10
fbox_plot(m4, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

Column operation is faster than row operation

# Benchmark input: 1e4-row table with an index column a and uniform
# noise column b.
n <- 1e4
dt <- data.table(
  a = seq(n), b = runif(n)
)
# Row-first access: subset whole rows, then pull column a from the result.
v1 <- function(dt) {
  kept_rows <- dt[dt$b > .5, ]
  d <- mean(kept_rows$a)
}

# Column-first access: index column a directly with the logical mask.
v2 <- function(dt) {
  mask <- dt$b > .5
  d <- mean(dt$a[mask])
}
# Time row-first vs column-first subsetting over 10 runs.
m5 <- microbenchmark(
  row_operation = v1(dt),
  column_operation = v2(dt),
  times = 10
)
print(m5)
## Unit: microseconds
##              expr     min      lq     mean   median      uq      max neval
##     row_operation 193.361 196.326 971.7363 210.4925 248.074 5700.458    10
##  column_operation  73.026  78.676 335.9857  82.7335  89.406 2512.659    10
fbox_plot(m5, "microseconds")

The sequence function seq() is safer than 1:n

The function seq() avoids the pitfall that occurs when the right-hand side of 1:x is zero: 1:0 counts down and yields c(1, 0) instead of an empty sequence.

num <- 1e7

# Mean of 1:num built with the colon operator (1:0 counts down, so this
# spelling is unsafe when num could be zero).
s1 <- function(num) {
  idx <- 1:num
  d <- mean(idx)
}

# Mean of the same sequence built with seq(), the safer spelling.
s2 <- function(num) {
  s <- seq(num)
  d <- mean(s)
}
# Time 1:num vs seq(num) over 30 runs — essentially identical speed;
# seq() is preferred for safety, not performance.
m6 <- microbenchmark(
  noseq = s1(num),
  seq = s2(num),
  times = 30
)
print(m6)
## Unit: milliseconds
##   expr      min       lq     mean   median       uq      max neval
##  noseq 69.69519 69.97604 70.04254 69.99300 70.01719 71.82410    30
##    seq 69.61423 69.98628 70.06606 69.99879 70.03862 71.64366    30
fbox_plot(m6, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

paste0 is faster than glue

# 1e6 rows of ids and random letters used for the string-building benchmark.
large_dataset <- data.table(
  id = 1:1000000,
  value = sample(letters, 1000000, replace = TRUE)
)
# Build code = "<id>_<value>" with base paste0() inside mutate().
a1 <- function(x) {
  res <- mutate(x, code = paste0(id, "_", value))
}

# Same column built with glue() string interpolation.
a2 <- function(x) {
  res <- mutate(x, code = glue("{id}_{value}"))
}
# Time paste0() vs glue() for building the same 1e6 strings.
m7 <- microbenchmark(
  with_paste = a1(large_dataset),
  with_glue = a2(large_dataset),
  times = 20
)
print(m7)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##  with_paste 562.4149 571.3529 580.9086 576.0443 581.9039 666.3175    20
##   with_glue 585.6863 589.0650 612.7230 590.4762 594.3041 998.0038    20
fbox_plot(m7, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

for loop vs lapply

# Create a large list
# 1e5 elements, each a vector of 10 normal draws.
big_list <- replicate(1e5, rnorm(10), simplify = FALSE)

# Time lapply() against an explicit for loop computing the same
# per-element means. NOTE(review): the loop grows `result` element by
# element; that allocation pattern is part of what is being measured.
m8 <- microbenchmark(
  lapply = lapply(big_list, mean),
  for_loop = {
    result <- list()
    for (i in seq_along(big_list)) {
      result[[i]] <- mean(big_list[[i]])
    }
  },
  times = 10
)

print(m8)
## Unit: milliseconds
##      expr      min       lq     mean   median       uq      max neval
##    lapply 316.7702 319.2393 335.3730 331.7764 348.6938 363.4554    10
##  for_loop 345.7628 350.9287 386.2824 368.5049 401.2409 519.6330    10
fbox_plot(m8, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

data.table package functions

Date vs IDate

# The same 100k dates stored twice: as base Date and as data.table's
# IDate class, plus a numeric payload column.
dt <- data.table(
  Date = as.Date('2023-01-01') + 0:99999,
  iDate = as.IDate('2023-01-01') + 0:99999,
  Value = rnorm(100000)
)

# Matching endpoints for the range query, one pair per date class.
nd1 <- as.Date('2023-01-01')
nd2 <- as.Date('2023-01-10')
id1 <- as.IDate('2023-01-01')
id2 <- as.IDate('2023-01-10')

# Subset rows whose Date falls in [nd1, nd2].
# FIX: the original used dplyr::filter() here while the IDate variant
# used data.table subsetting, so the benchmark mixed dplyr-vs-data.table
# overhead into the Date-vs-IDate comparison. Both variants now use the
# same data.table between() idiom; recorded timings predate this change.
# NOTE(review): both functions read the global `dt` rather than taking
# the table as an argument.
date_between_operation <- function(nd1, nd2) {
  dt |> _[data.table::between(Date, nd1, nd2)]
}
# Subset rows whose iDate falls in [id1, id2] (IDate is data.table's
# integer-backed date class, so comparisons are cheaper).
idate_between_operation <- function(id1, id2) {
  dt |> _[data.table::between(iDate, id1, id2)]
}

# Time the Date-based vs IDate-based range query over 200 runs.
m9 <- microbenchmark(
  Date = date_between_operation(nd1, nd2),
  iDate = idate_between_operation(id1, id2),
  times = 200L
)
print(m9)
## Unit: microseconds
##   expr      min       lq      mean    median        uq       max neval
##   Date 1459.513 1515.393 1893.2597 1753.6175 2020.3000 11772.369   200
##  iDate  570.534  611.176  721.0323  632.4755  798.3795  2240.191   200
fbox_plot(m9, "microseconds")  # FIX: label said "miliseconds" but the results are in microseconds

Base R switch vs Dplyr case_when (for simple tasks)

# Map a single letter code to a fruit name with base switch(); the
# trailing unnamed argument is the fallback for unmatched codes.
switch_function <- function(x) {
  switch(
    x,
    a = "apple",
    b = "banana",
    c = "cherry",
    "default"
  )
}
# Same scalar mapping expressed with dplyr::case_when(), which is
# vectorised and so does far more work per single-value call.
case_when_function <- function(x) {
  mapped <- case_when(
    x == "a" ~ "apple",
    x == "b" ~ "banana",
    x == "c" ~ "cherry",
    TRUE ~ "default"
  )
  mapped
}
# Create a vector of test values
# Time per-scalar dispatch: base switch() vs case_when() called one
# element at a time through sapply().
test_values <- sample(c("a", "b", "c", "d"), 1000, replace = TRUE)
m10 <- microbenchmark(
  switch = sapply(test_values, switch_function),
  case_when = sapply(test_values, case_when_function),
  times = 200L
)
print(m10)
## Unit: microseconds
##       expr        min          lq        mean    median         uq       max
##     switch    633.532    647.7385    678.8005    656.17    673.312   2165.25
##  case_when 229197.459 243041.3725 246470.9864 245410.62 249228.503 387248.51
##  neval
##    200
##    200
fbox_plot(m10, "microseconds")

data.table fcase vs Dplyr case_when

# 1e6 rows with integer scores in 1..100 to be binned below.
set.seed(123)
n <- 1e6
data <- data.table(
  id = seq(n),
  value = sample(seq(100), n, replace = TRUE)
)

# Bin value into Low/Medium/High with dplyr::case_when(); conditions
# are evaluated in order, so the first match wins.
casewhenf <- function(data) {
  df <- mutate(data, category = case_when(
    value <= 20 ~ "Low",
    value <= 70 ~ "Medium",
    value > 70 ~ "High"))
}
# Same ordered binning with data.table::fcase().
fcasef <- function(data) {
  df <- mutate(data, category = fcase(
    value <= 20, "Low",
    value <= 70, "Medium",
    value > 70, "High"))
}
# Time case_when() vs fcase() for the same binning over 20 runs.
m11 <- microbenchmark(
  case_when = casewhenf(data),
  fcase = fcasef(data),
  times = 20
)
print(m11)
## Unit: milliseconds
##       expr      min       lq     mean   median       uq      max neval
##  case_when 56.36157 56.91179 60.62484 61.08828 62.99643 73.42582    20
##      fcase 20.66431 20.88550 24.21154 21.55173 23.36648 46.32106    20
fbox_plot(m11, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

data.table fcoalesce vs tidyr replace_na

# 1e6 rows where both value columns contain scattered NAs to be filled.
set.seed(123)
DT <- data.table(
  ID = 1:1e6,
  Value1 = sample(c(NA, 1:100), 1e6, replace = TRUE),
  Value2 = sample(c(NA, 101:200), 1e6, replace = TRUE)
)

# Define the two NA-filling strategies.
# Fill NAs with 0 via tidyr::replace_na(), then pass through
# as.data.table() as the original comparison did.
# NOTE(review): that extra conversion call is included in the timing.
replace_na_f <- function(data) {
  DF <- mutate(data,
               Value1 = replace_na(Value1, 0),
               Value2 = replace_na(Value2, 0)) |>
    as.data.table()
}
# Fill NAs with integer 0 via data.table::fcoalesce().
fcoalesce_f <- function(data) {
  DF <- mutate(data,
               Value1 = fcoalesce(Value1, 0L),
               Value2 = fcoalesce(Value2, 0L))
}
# Time replace_na() vs fcoalesce() over 20 runs.
m12 <- microbenchmark(
  treplace_na = replace_na_f(DT),
  tfcoalesce = fcoalesce_f(DT),
  times = 20
)
print(m12)
## Unit: milliseconds
##         expr      min       lq     mean   median       uq      max neval
##  treplace_na 7.388027 7.471763 7.979352 7.718609 8.288047 10.21760    20
##   tfcoalesce 1.556244 1.731797 2.867601 1.879382 2.405153 18.01402    20
fbox_plot(m12, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

data.table notation vs dplyr notation

# Four dot-separated descriptor strings; two of them contain "red".
dt <- data.table(field_name = c("argentina.blue.man.watch", 
                                "brazil.red.woman.shoes", 
                                "canada.green.kid.hat", 
                                "denmark.red.man.shirt"))

# Filter rows where 'field_name' does not contain 'red'
# data.table NSE subsetting through the native-pipe placeholder.
dtnot <- function(data) {
  filtered_dt <- data |> _[!grepl("red", field_name)]
}
# Same filter via an anonymous function using plain data.frame-style
# subsetting on the field_name column.
anonymousnot <- function(data) {
  drop_red <- \(dt) dt[!grepl("red", dt$field_name), ]
  filtered_dt <- drop_red(data)
}
# dplyr equivalent with filter().
dplyrnot <- function(data) {
  filtered_dt <- data |> filter(!grepl("red", field_name))
}

# Time the three not-"red" filters over 100 runs.
m13 <- microbenchmark(
  anonymous_not = anonymousnot(dt),
  data_table_not = dtnot(dt),
  dplyr_not = dplyrnot(dt),
  times = 100
)
print(m13)
## Unit: microseconds
##            expr     min       lq     mean  median       uq      max neval
##   anonymous_not 103.544 110.6565 157.4315 121.111 142.1450 3050.883   100
##  data_table_not 100.528 106.5985 140.5623 115.831 137.9675 1811.130   100
##       dplyr_not 683.585 702.4205 772.8657 723.791 747.5755 3021.749   100
fbox_plot(m13, "microseconds")

data.table melt vs tidyr pivot_longer

# Wide table: one id column plus four value columns to reshape long.
large_data <- data.table(
  id = 1:100000,
  var1 = rnorm(100000),
  var2 = rnorm(100000),
  var3 = rnorm(100000),
  var4 = rnorm(100000)
)
# Benchmarking
# Time wide-to-long reshaping: tidyr::pivot_longer() vs data.table::melt().
m14 <- microbenchmark(
  tidyr_pivot_longer = {
    long_data_tidyr <- pivot_longer(large_data, cols = starts_with("var"), 
                                    names_to = "variable", values_to = "value")
  },
  data_table_melt = {
    long_data_dt <- melt(large_data, id.vars = "id", variable.name = "variable", 
                         value.name = "value")
  },
  times = 10
)

print(m14)
## Unit: microseconds
##                expr      min       lq      mean    median       uq       max
##  tidyr_pivot_longer 6639.901 6892.312 9591.4079 7025.8465 7106.542 33021.823
##     data_table_melt  407.090  496.948  581.8058  572.0675  709.063   801.385
##  neval
##     10
##     10
fbox_plot(m14, "microseconds")

data.table CJ vs tidyr expand_grid

vec1 <- seq(1000)
vec2 <- seq(1000)

# Cross all 1000 x 1000 combinations with tidyr::expand_grid().
expand_grid_func <- function() {
  expand_grid(vec1, vec2)
}

# Same cross join with data.table::CJ().
CJ_func <- function() {
  CJ(vec1, vec2)
}

# Perform benchmarking
# Time expand_grid() vs CJ() for the 1e6-row cross join.
m15 <- microbenchmark(
  expand_grid = expand_grid_func(),
  CJ = CJ_func(),
  times = 10
)

print(m15)
## Unit: microseconds
##         expr      min       lq      mean   median       uq      max neval
##  expand_grid 2219.372 2261.751 2521.4544 2351.548 2386.073 3759.155    10
##           CJ  455.861  463.785  649.7245  478.208  497.789 1836.868    10
fbox_plot(m15, "microseconds")

data.table rbindlist vs R rbind

# Sample data: 50 data.tables of 1e4 rows each.
# FIX: <- for top-level assignment and TRUE/FALSE spelled out (T/F are
# ordinary variables that can be reassigned).
size <- 1e4
set.seed(44)
df_list <- replicate(50, data.table(id = sample(seq(size), size, replace = TRUE),
                                    value = rnorm(size)), simplify = FALSE)

# Stack a list of data.frames with base rbind via do.call().
simple_bind <- function(list_of_dfs){
  do.call(rbind, list_of_dfs)
}

# dplyr equivalent.
dplyr_bind <- function(list_of_dfs){
  bind_rows(list_of_dfs)
}

# data.table equivalent; fill = FALSE requires identical column sets.
# FIX: FALSE spelled out (was the reassignable shorthand F).
dt_bind <- function(list_of_dfs){
  rbindlist(list_of_dfs, fill = FALSE)
}

# Benchmark both methods
# Time the three stacking strategies over 30 runs.
m16 <- microbenchmark(
  dt_ver = dt_bind(df_list),
  simple = simple_bind(df_list),
  dplyr_ver = dplyr_bind(df_list),
  times = 30
)

print(m16)
## Unit: microseconds
##       expr       min        lq       mean     median        uq       max neval
##     dt_ver   431.115   470.739   586.4407   512.6815   588.047  1976.799    30
##     simple   488.862   519.830   611.7507   563.7670   623.764  1884.857    30
##  dplyr_ver 10111.600 10384.979 10927.4029 10476.8610 10648.181 21415.835    30
fbox_plot(m16, "microseconds")

stringr word vs tidyr separate vs data.table tstrsplit

set.seed(123)
n <- 1e4
# One dot-separated five-word string, replicated to n identical rows.
# FIX: FALSE spelled out (was the reassignable shorthand F).
df <- data.table(text = paste("word1", "word2", "word3", "word4", "word5", sep = "."), stringsAsFactors = FALSE)
df <- df[rep(1, n), , drop = FALSE]

# Using tidyr::separate — split text on "." into five columns, then keep
# only w3/w5 (text itself is kept because remove = FALSE).
# FIX: FALSE spelled out (was the reassignable shorthand F).
separate_words <- function() {
  df |> 
    separate(text, into = c("w1", "w2", "w3", "w4", "w5"), sep = "\\.", remove = FALSE) |> 
    select(-c(w1, w2, w4))
}

# Using stringr::word — pull just the 3rd and 5th dot-separated words.
stringr_words <- function() {
  df |> 
    mutate(
      w3 = word(text, 3, sep = fixed(".")),
      w5 = word(text, 5, sep = fixed("."))
    )
}

# Using data.table — tstrsplit() splits once and := assigns w3/w5 by
# reference. NOTE(review): this mutates the global `df` in place, so
# iterations after the first overwrite the already-added columns.
datatable_words <- function() {
  df |> _[, c("w3", "w5") := tstrsplit(text, "\\.")[c(3, 5)]]
}

# Time the three word-extraction strategies over 10 runs.
m17 <- microbenchmark(
  separate = separate_words(),
  stringr = stringr_words(),
  dt = datatable_words(),
  times = 10
)

print(m17)
## Unit: milliseconds
##      expr       min        lq      mean    median        uq       max neval
##  separate  79.83310  83.29129  94.29407  94.72704  97.69480 117.03869    10
##   stringr 181.41822 185.22161 215.12042 211.97258 230.81153 288.14400    10
##        dt  12.30277  12.43595  12.91484  12.54286  12.65337  15.33692    10
fbox_plot(m17, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

data.table na_omit vs dplyr drop_na

# Sample data
set.seed(123)
n <- 1e6
# 1e6 rows with NAs sprinkled through columns y and z.
# FIX: FALSE spelled out (was the reassignable shorthand F).
df <- data.table(
  x = rnorm(n),
  y = sample(c(NA, 1:100), n, replace = TRUE),
  z = sample(c(NA, letters), n, replace = TRUE),
  stringsAsFactors = FALSE
)

# Benchmark both methods
# Time dropping NA rows: dplyr::drop_na() vs data.table-aware na.omit().
# FIX: the original benchmarked `dt` — a tiny 4-row table left over from
# an earlier section — instead of the 1e6-row `df` built above, which is
# why na.omit appeared to take only ~9 microseconds. The recorded output
# below comes from that buggy run and should be regenerated.
m18 <- microbenchmark(
  dplyr_drop_na = {
    df |> drop_na()
  },
  data_table_na_omit = {
    df |> na.omit()
  },
  times = 10
)

print(m18)
## Unit: microseconds
##                expr      min       lq      mean   median       uq       max
##       dplyr_drop_na 9365.868 9378.702 9593.9516 9417.189 9705.151 10220.052
##  data_table_na_omit    8.756    9.097   51.8919   60.708   62.968   172.432
##  neval
##     10
##     10
fbox_plot(m18, "microseconds")

Parallel processing

lapply vs parallel mclapply

# Sample data: 100 tables to join, plus one lookup table.
# FIX: <- for top-level assignment and TRUE/FALSE spelled out (T/F are
# ordinary variables that can be reassigned).
set.seed(123)

size <- 1e4
n_cores <- parallelly::availableCores()

df_list <- replicate(100, data.table(id = sample(seq(size), size, replace = TRUE),
                                     value = rnorm(size)), simplify = FALSE)
extra_df <- data.table(id = sample(seq(size), size, replace = TRUE),
                       extra_value = runif(size))

# Sequential join of every table in df_list against extra_df.
# FIX: TRUE spelled out (was the reassignable shorthand T).
sequential_join <- function() {
  lapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  })
}

# Parallel join: the same merges fanned out over n_cores with mclapply().
parallel_join <- function() {
  mclapply(df_list, function(df) {
    merge(df, extra_df, by = "id", allow.cartesian = TRUE)
  }, mc.cores = n_cores, mc.silent = TRUE, mc.cleanup = TRUE)
}

# Benchmark both methods
# Time sequential vs parallel joins over 10 runs.
m19 <- microbenchmark(
  sequential = sequential_join(),
  parallel = parallel_join(),
  times = 10
)

print(m19)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##  sequential 593.2165 603.7509 655.2140 632.6488 693.8236 823.1743    10
##    parallel 222.2954 241.5792 254.6547 248.7249 255.6517 325.9662    10
fbox_plot(m19, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

dtplyr

dtplyr is another alternative (you need to install the dtplyr package)

# 1e7 rows with two grouping factors and two numeric measures.
set.seed(123)
n <- 1e7
df <- data.table(
  group1 = sample(LETTERS[1:10], n, replace = TRUE),
  group2 = sample(letters[1:5], n, replace = TRUE),
  value1 = rnorm(n),
  value2 = runif(n, 1, 100)
)

# Time the dplyr pipeline against the same pipeline routed through
# dtplyr::lazy_dt().
m21 <- microbenchmark(
  basic_way = {
    # FIX: temp renamed from `dplyr` (it shadowed the package name).
    res_dplyr <- df |> 
      filter(value1 > 0) |> 
      mutate(ratio = value1 / value2) |> 
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |> 
      as.data.table()
  },
  dtplyr_way = {
    # FIX: temp renamed from `dtplyr` and assigned with <- (was `=`).
    res_dtplyr <- df |> 
      lazy_dt() |> 
      filter(value1 > 0) |> 
      mutate(ratio = value1 / value2) |> 
      summarize(
        mean_val1 = mean(value1),
        sd_val1 = sd(value1),
        median_val2 = median(value2),
        max_ratio = max(ratio), .by = c("group1", "group2")) |> 
      as.data.table()
  },
  times = 5
)

print(m21)
## Unit: milliseconds
##        expr      min       lq     mean   median       uq      max neval
##   basic_way 515.0258 553.2352 611.6810 648.2386 659.3700 682.5351     5
##  dtplyr_way 622.0624 640.6803 701.3499 657.3354 766.8708 819.8007     5
fbox_plot(m21, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"

DuckDB

DuckDB database files vs Parquet

# Read the in-progress "golden data" from a partitioned parquet dataset,
# derive the aggregate memory/session columns, and materialise the
# result as a data.table.
# NOTE(review): open_dataset()/collect() look like the arrow package —
# confirm it is attached where this runs.
with_parquet <- function(){
  fp_data <- "/conf/posit_azure_logs/data"
  
  data_1 <- open_dataset(file.path(glue::glue("{fp_data}/golden_data_in_progress"))) |>
    # Keep only the raw session/memory/node columns needed below.
    select(
      date, hours, time,
      ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
      ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
      ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
      ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
      ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
      ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
      ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
      ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
      ALL_WIP_CP_node_total, ALL_WIP_BP_node_total
    ) |>
    # Derived totals; later expressions reuse columns computed earlier
    # in this same mutate() (e.g. total_node_mem), so order matters.
    mutate(
      computepool_node_mem = ALL_WIP_CP_node_total * (160 * 1024),
      bigpool_node_mem = ALL_WIP_BP_node_total * (256 * 1024),
      ALL_WIP_day_session = ALL_WIP_CP_day_session + ALL_WIP_BP_day_session,
      ALL_WIP_night_session = ALL_WIP_CP_night_session + ALL_WIP_BP_night_session,
      ALL_WIP_node_total = ALL_WIP_CP_node_total + ALL_WIP_BP_node_total,
      total_mem_limit = ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit,
      total_mem_request = ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request,
      total_mem_max = ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max,
      total_node_mem = computepool_node_mem + bigpool_node_mem,
      # Guard against division by zero when no nodes are reported.
      average_session_per_node = ifelse(ALL_WIP_node_total != 0,
                                         (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total, 0)
    ) |>
    collect() |>
    as.data.table()
}

# Read the same derived dataset from a local copy of a DuckDB database
# file, computing the aggregate columns inside the SQL query itself.
# FIX: the original opened a DuckDB connection inline and never closed
# it (one leaked connection per benchmark iteration), and its last
# expression was file.remove(), so it returned a logical instead of the
# data. The connection and the temporary file copy are now released via
# on.exit(), and the query result is returned as a data.table.
with_duckfile <- function(){
  file.copy("/conf/posit_azure_logs/gatzos01/gd_inprogress.duckdb", "gd_inprogress.duckdb")
  on.exit(file.remove("./gd_inprogress.duckdb"), add = TRUE)
  
  con <- dbConnect(duckdb::duckdb(), dbdir = "./gd_inprogress.duckdb")
  on.exit(dbDisconnect(con, shutdown = TRUE), add = TRUE)
  
  data_2 <- dbGetQuery(
    conn = con,
    statement = glue("select date, hours, time,
        ALL_WIP_CP_day_session, ALL_WIP_CP_night_session,
        ALL_WIP_BP_day_session, ALL_WIP_BP_night_session,
        ALL_WIP_CP_DS_mem_limit, ALL_WIP_CP_NS_mem_limit,
        ALL_WIP_BP_DS_mem_limit, ALL_WIP_BP_NS_mem_limit,
        ALL_WIP_CP_DS_mem_request, ALL_WIP_CP_NS_mem_request,
        ALL_WIP_BP_DS_mem_request, ALL_WIP_BP_NS_mem_request,
        ALL_WIP_CP_DS_mem_max, ALL_WIP_CP_NS_mem_max,
        ALL_WIP_BP_DS_mem_max, ALL_WIP_BP_NS_mem_max,
        ALL_WIP_CP_node_total, ALL_WIP_BP_node_total,
        ALL_WIP_CP_node_total * 160 * 1024 as computepool_node_mem,
        ALL_WIP_BP_node_total * 256 * 1024 as bigpool_node_mem,
        ALL_WIP_CP_day_session + ALL_WIP_BP_day_session as ALL_WIP_day_session,
        ALL_WIP_CP_night_session + ALL_WIP_BP_night_session as ALL_WIP_night_session,
        ALL_WIP_CP_node_total + ALL_WIP_BP_node_total as ALL_WIP_node_total,
        ALL_WIP_CP_DS_mem_limit + ALL_WIP_CP_NS_mem_limit + ALL_WIP_BP_DS_mem_limit + ALL_WIP_BP_NS_mem_limit as total_mem_limit,
        ALL_WIP_CP_DS_mem_request + ALL_WIP_CP_NS_mem_request + ALL_WIP_BP_DS_mem_request + ALL_WIP_BP_NS_mem_request as total_mem_request,
        ALL_WIP_CP_DS_mem_max + ALL_WIP_CP_NS_mem_max + ALL_WIP_BP_DS_mem_max + ALL_WIP_BP_NS_mem_max as total_mem_max,
        computepool_node_mem + bigpool_node_mem as total_node_mem,
        CASE 
          WHEN ALL_WIP_node_total != 0 THEN (ALL_WIP_day_session + ALL_WIP_night_session) / ALL_WIP_node_total
          ELSE 0
        END AS average_session_per_node
      from gdinprog"),
    immediate = TRUE) |> 
    as.data.table()
  
  data_2
}

# Time the parquet-scan pipeline against the DuckDB-file query (3 runs).
m22 <- microbenchmark(
  with_parquet = with_parquet(),
  with_duckfile = with_duckfile(),
  times = 3
)

print(m22)
## Unit: milliseconds
##           expr       min         lq       mean     median       uq       max
##   with_parquet 10369.535 18304.6670 21565.5895 26239.7988 27163.62 28087.434
##  with_duckfile   501.711   644.0689   927.7635   786.4268  1140.79  1495.153
##  neval
##      3
##      3
fbox_plot(m22, "milliseconds")  # FIX: label typo "miliseconds" -> "milliseconds"